This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_500/notebooks/PE_500_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b6b80d07310>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
3
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 385 | 385 | Prevotella fusca |
| 389 | 389 | Prevotella oris |
| 831 | 831 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
1
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 1237 | 1237 | [Ruminococcus] gnavus |
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b6b80ffca90>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b6b8108b050>]
PCA_full.explained_variance_ratio_[0:10]
array([0.32007816, 0.25488517, 0.21651208, 0.2085245 ], dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_1K/notebooks/PE_1K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b4a77e0e8d0>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
5
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 32 | 32 | Bacteroides sp. CACC 737 |
| 565 | 565 | Prevotella melaninogenica |
| 566 | 566 | Prevotella oris |
| 567 | 567 | Prevotella ruminicola |
| 1192 | 1192 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
2
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 259 | 259 | Blautia sp. SC05B48 |
| 1760 | 1760 | [Ruminococcus] gnavus |
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b4aac4c4b90>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b4aac5506d0>]
PCA_full.explained_variance_ratio_[0:10]
array([0.3367226 , 0.19552425, 0.14681527, 0.1091447 , 0.10514598,
0.06142709, 0.04522003], dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_5K/notebooks/PE_5K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b7f46901590>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
15
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 52 | 52 | Bacteroides sp. CACC 737 |
| 218 | 218 | Prevotella sp. oral taxon 299 |
| 416 | 416 | Bacteroides zoogleoformans |
| 951 | 951 | Phocaeicola salanitronis |
| 977 | 977 | Prevotella dentalis |
| 978 | 978 | Prevotella denticola |
| 979 | 979 | Prevotella enoeca |
| 980 | 980 | Prevotella fusca |
| 981 | 981 | Prevotella intermedia |
| 982 | 982 | Prevotella jejuni |
| 983 | 983 | Prevotella melaninogenica |
| 984 | 984 | Prevotella oris |
| 985 | 985 | Prevotella ruminicola |
| 986 | 986 | Prevotella scopos |
| 2188 | 2188 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
14
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 423 | 423 | Blautia sp. SC05B48 |
| 1683 | 1683 | Clostridioides difficile |
| 1934 | 1934 | Enterococcus faecium |
| 1964 | 1964 | Eubacterium limosum |
| 2031 | 2031 | Gordonibacter pamelaeae |
| 2186 | 2186 | Lachnoclostridium phocaeense |
| 1106 | 1106 | Streptococcus anginosus |
| 3109 | 3109 | Streptococcus cristatus |
| 3110 | 3110 | Streptococcus equinus |
| 3137 | 3137 | Streptococcus pneumoniae |
| 3145 | 3145 | Streptococcus salivarius |
| 3377 | 3377 | [Clostridium] hylemonae |
| 3379 | 3379 | [Clostridium] scindens |
| 3381 | 3381 | [Ruminococcus] gnavus |
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b7f46c36a50>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b7f46cb97d0>]
PCA_full.explained_variance_ratio_[0:10]
array([0.28482836, 0.13394621, 0.07766104, 0.04418063, 0.03974501,
0.03701495, 0.03462601, 0.03136414, 0.02942527, 0.0261827 ],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_10K/notebooks/PE_10K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b1bddca6ed0>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
18
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 30 | 30 | Alloprevotella sp. E39 |
| 52 | 52 | Bacteroides sp. A1C1 |
| 53 | 53 | Bacteroides sp. CACC 737 |
| 236 | 236 | Prevotella sp. oral taxon 299 |
| 524 | 524 | Chitinophaga pinensis |
| 851 | 851 | Maribacter cobaltidurans |
| 1082 | 1082 | Phocaeicola salanitronis |
| 1108 | 1108 | Prevotella dentalis |
| 1109 | 1109 | Prevotella denticola |
| 1110 | 1110 | Prevotella enoeca |
| 1111 | 1111 | Prevotella fusca |
| 1112 | 1112 | Prevotella intermedia |
| 1113 | 1113 | Prevotella jejuni |
| 1114 | 1114 | Prevotella melaninogenica |
| 1115 | 1115 | Prevotella oris |
| 1116 | 1116 | Prevotella ruminicola |
| 1117 | 1117 | Prevotella scopos |
| 2546 | 2546 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
33
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 1546 | 1546 | Actinomyces pacaensis |
| 1727 | 1727 | Blautia argi |
| 456 | 456 | Blautia sp. SC05B48 |
| 1963 | 1963 | Clostridioides difficile |
| 2236 | 2236 | Enterocloster bolteae |
| 2243 | 2243 | Enterococcus faecium |
| 2277 | 2277 | Eubacterium limosum |
| 648 | 648 | Eubacterium sp. NSJ-61 |
| 2287 | 2287 | Faecalitalea cylindroides |
| 2544 | 2544 | Lachnoclostridium phocaeense |
| 2549 | 2549 | Lacrimispora sphenoides |
| 2633 | 2633 | Lancefieldella parvula |
| 5048 | 5048 | Raoultella ornithinolytica |
| 3514 | 3514 | Schaalia odontolytica |
| 1269 | 1269 | Streptococcus anginosus |
| 3691 | 3691 | Streptococcus australis |
| 3694 | 3694 | Streptococcus equinus |
| 3698 | 3698 | Streptococcus gwangjuense |
| 3709 | 3709 | Streptococcus milleri |
| 3710 | 3710 | Streptococcus mitis |
| 3712 | 3712 | Streptococcus oralis |
| 3714 | 3714 | Streptococcus parasanguinis |
| 3722 | 3722 | Streptococcus pneumoniae |
| 3724 | 3724 | Streptococcus pseudopneumoniae |
| 3730 | 3730 | Streptococcus salivarius |
| 1282 | 1282 | Streptococcus sp. FDAARGOS_192 |
| 1286 | 1286 | Streptococcus sp. HSISS1 |
| 1288 | 1288 | Streptococcus sp. HSISS3 |
| 1297 | 1297 | Streptococcus sp. oral taxon 061 |
| 1299 | 1299 | Streptococcus sp. oral taxon 431 |
| 4024 | 4024 | [Clostridium] scindens |
| 4025 | 4025 | [Eubacterium] cellulosolvens |
| 4028 | 4028 | [Ruminococcus] gnavus |
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b1bde00d810>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b1bde085f50>]
PCA_full.explained_variance_ratio_[0:10]
array([0.2345628 , 0.17777613, 0.07285864, 0.04906721, 0.03577022,
0.02678402, 0.02412703, 0.02319624, 0.02202586, 0.02043021],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_25K/notebooks/PE_25K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b749a04b190>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
20
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 20 | 33 | Alloprevotella sp. E39 |
| 38 | 56 | Bacteroides sp. A1C1 |
| 39 | 57 | Bacteroides sp. CACC 737 |
| 158 | 248 | Prevotella sp. oral taxon 299 |
| 296 | 483 | Bacteroides zoogleoformans |
| 297 | 484 | Barnesiella viscericola |
| 378 | 668 | Duncaniella dubosii |
| 547 | 1033 | Muribaculum intestinale |
| 632 | 1209 | Phocaeicola salanitronis |
| 649 | 1237 | Prevotella dentalis |
| 650 | 1238 | Prevotella denticola |
| 651 | 1239 | Prevotella enoeca |
| 652 | 1240 | Prevotella fusca |
| 653 | 1241 | Prevotella intermedia |
| 654 | 1242 | Prevotella jejuni |
| 655 | 1243 | Prevotella melaninogenica |
| 656 | 1244 | Prevotella oris |
| 657 | 1245 | Prevotella ruminicola |
| 658 | 1246 | Prevotella scopos |
| 1350 | 2872 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
53
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 836 | 1747 | Actinomyces pacaensis |
| 873 | 1839 | Anaerotignum propionicum |
| 936 | 1943 | Blautia argi |
| 302 | 492 | Blautia sp. SC05B48 |
| 321 | 551 | Caproiciproducens sp. NJN-50 |
| 1020 | 2212 | Clostridioides difficile |
| 1058 | 2257 | Collinsella aerofaciens |
| 384 | 674 | Eggerthella sp. YY7918 |
| 1182 | 2519 | Enterocloster bolteae |
| 1189 | 2526 | Enterococcus faecium |
| 1209 | 2568 | Eubacterium callanderi |
| 1210 | 2569 | Eubacterium limosum |
| 407 | 717 | Eubacterium sp. NSJ-61 |
| 1220 | 2579 | Faecalitalea cylindroides |
| 446 | 767 | Gemella haemolysans |
| 0 | 0 | Homo sapiens |
| 1347 | 2869 | Lachnoanaerobaculum umeaense |
| 1348 | 2870 | Lachnoclostridium phocaeense |
| 1416 | 2962 | Lancefieldella parvula |
| 535 | 1018 | Mogibacterium diversum |
| 1570 | 3480 | Olsenella uli |
| 1730 | 3903 | Rothia mucilaginosa |
| 1735 | 3918 | Ruthenibacterium lactatiformans |
| 1752 | 3960 | Schaalia meyeri |
| 1753 | 3961 | Schaalia odontolytica |
| 723 | 1430 | Streptococcus anginosus |
| 1819 | 4159 | Streptococcus australis |
| 1821 | 4161 | Streptococcus cristatus |
| 1822 | 4162 | Streptococcus equinus |
| 1826 | 4166 | Streptococcus gwangjuense |
| 727 | 1434 | Streptococcus intermedius |
| 1833 | 4173 | Streptococcus lutetiensis |
| 1837 | 4177 | Streptococcus milleri |
| 1838 | 4178 | Streptococcus mitis |
| 1840 | 4180 | Streptococcus oralis |
| 1842 | 4182 | Streptococcus parasanguinis |
| 1847 | 4196 | Streptococcus pneumoniae |
| 1855 | 4204 | Streptococcus salivarius |
| 731 | 1439 | Streptococcus sp. A12 |
| 735 | 1444 | Streptococcus sp. FDAARGOS_192 |
| 736 | 1447 | Streptococcus sp. HSISM1 |
| 737 | 1448 | Streptococcus sp. HSISS1 |
| 738 | 1449 | Streptococcus sp. HSISS2 |
| 739 | 1450 | Streptococcus sp. HSISS3 |
| 742 | 1453 | Streptococcus sp. KS 6 |
| 748 | 1459 | Streptococcus sp. oral taxon 061 |
| 750 | 1461 | Streptococcus sp. oral taxon 431 |
| 1859 | 4208 | Streptococcus thermophilus |
| 1970 | 4541 | [Clostridium] hylemonae |
| 1971 | 4542 | [Clostridium] innocuum |
| 1972 | 4543 | [Clostridium] scindens |
| 1973 | 4544 | [Eubacterium] cellulosolvens |
| 1974 | 4547 | [Ruminococcus] gnavus |
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b749a331050>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b749b043490>]
PCA_full.explained_variance_ratio_[0:10]
array([0.22760314, 0.14726828, 0.0759825 , 0.04990283, 0.03358746,
0.02806716, 0.02557435, 0.02294869, 0.02007085, 0.0189527 ],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_50K/notebooks/PE_50K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b06cd452fd0>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
27
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 11 | 33 | Alloprevotella sp. E39 |
| 22 | 59 | Bacteroides sp. A1C1 |
| 23 | 60 | Bacteroides sp. CACC 737 |
| 25 | 62 | Bacteroides sp. HF-162 |
| 80 | 217 | Muribaculum sp. TLL-A4 |
| 93 | 260 | Prevotella sp. oral taxon 299 |
| 109 | 300 | Tannerella sp. oral taxon HOT-286 |
| 129 | 362 | Alistipes megaguti |
| 171 | 500 | Bacteroides coprosuis |
| 181 | 510 | Bacteroides zoogleoformans |
| 182 | 511 | Barnesiella viscericola |
| 249 | 712 | Duncaniella dubosii |
| 349 | 988 | Labilibaculum antarcticum |
| 379 | 1116 | Muribaculum intestinale |
| 419 | 1276 | Paludibacter propionicigenes |
| 435 | 1295 | Phocaeicola salanitronis |
| 451 | 1325 | Prevotella dentalis |
| 452 | 1326 | Prevotella denticola |
| 453 | 1327 | Prevotella enoeca |
| 454 | 1328 | Prevotella fusca |
| 455 | 1329 | Prevotella intermedia |
| 456 | 1330 | Prevotella jejuni |
| 457 | 1331 | Prevotella melaninogenica |
| 458 | 1332 | Prevotella oris |
| 459 | 1333 | Prevotella ruminicola |
| 460 | 1334 | Prevotella scopos |
| 914 | 3086 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
78
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 544 | 1824 | Abiotrophia defectiva |
| 563 | 1902 | Actinomyces pacaensis |
| 136 | 377 | Anaerocolumna sp. CBA3638 |
| 591 | 1997 | Anaerostipes rhamnosivorans |
| 592 | 1998 | Anaerotignum propionicum |
| ... | ... | ... |
| 1288 | 4843 | [Clostridium] hylemonae |
| 1289 | 4844 | [Clostridium] innocuum |
| 1290 | 4845 | [Clostridium] scindens |
| 1291 | 4846 | [Eubacterium] cellulosolvens |
| 1292 | 4849 | [Ruminococcus] gnavus |
78 rows × 2 columns
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b06ce3fa910>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b06ce437150>]
PCA_full.explained_variance_ratio_[0:10]
array([0.22097564, 0.12370215, 0.06650722, 0.04725637, 0.03010586,
0.0276116 , 0.02304265, 0.02046448, 0.01962958, 0.01692695],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_100K/notebooks/PE_100K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b60423b61d0>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
26
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 11 | 36 | Alloprevotella sp. E39 |
| 17 | 64 | Bacteroides sp. A1C1 |
| 18 | 65 | Bacteroides sp. CACC 737 |
| 19 | 66 | Bacteroides sp. CBA7301 |
| 20 | 67 | Bacteroides sp. HF-162 |
| 54 | 225 | Muribaculum sp. TLL-A4 |
| 65 | 268 | Prevotella sp. oral taxon 299 |
| 73 | 307 | Tannerella sp. oral taxon HOT-286 |
| 88 | 374 | Alistipes megaguti |
| 121 | 530 | Bacteroides uniformis |
| 124 | 533 | Barnesiella viscericola |
| 166 | 745 | Duncaniella dubosii |
| 262 | 1186 | Mucinivorans hirudinis |
| 263 | 1189 | Muribaculum intestinale |
| 302 | 1379 | Phocaeicola salanitronis |
| 313 | 1411 | Prevotella dentalis |
| 314 | 1412 | Prevotella denticola |
| 315 | 1413 | Prevotella enoeca |
| 316 | 1414 | Prevotella fusca |
| 317 | 1415 | Prevotella intermedia |
| 318 | 1416 | Prevotella jejuni |
| 319 | 1417 | Prevotella melaninogenica |
| 320 | 1418 | Prevotella oris |
| 321 | 1419 | Prevotella ruminicola |
| 322 | 1420 | Prevotella scopos |
| 671 | 3261 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
82
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 393 | 1940 | Abiotrophia defectiva |
| 405 | 2020 | Actinomyces pacaensis |
| 93 | 388 | Anaerocolumna sp. CBA3638 |
| 423 | 2124 | Anaerostipes rhamnosivorans |
| 449 | 2228 | Bifidobacterium longum |
| ... | ... | ... |
| 934 | 5099 | [Clostridium] hylemonae |
| 935 | 5100 | [Clostridium] innocuum |
| 936 | 5101 | [Clostridium] scindens |
| 937 | 5102 | [Eubacterium] cellulosolvens |
| 938 | 5105 | [Ruminococcus] gnavus |
82 rows × 2 columns
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b6043340410>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b60433b5c10>]
PCA_full.explained_variance_ratio_[0:10]
array([0.2255645 , 0.12491108, 0.0727939 , 0.04617774, 0.03180021,
0.02819338, 0.02619162, 0.0204767 , 0.01948499, 0.01763883],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_500K/notebooks/PE_500K_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b652324e510>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
30
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 5 | 45 | Alloprevotella sp. E39 |
| 8 | 76 | Bacteroides sp. A1C1 |
| 9 | 77 | Bacteroides sp. CACC 737 |
| 10 | 78 | Bacteroides sp. CBA7301 |
| 11 | 79 | Bacteroides sp. HF-162 |
| 21 | 252 | Muribaculum sp. TLL-A4 |
| 25 | 297 | Prevotella sp. oral taxon 299 |
| 28 | 345 | Tannerella sp. oral taxon HOT-286 |
| 39 | 422 | Alistipes megaguti |
| 54 | 585 | Bacteroides coprosuis |
| 62 | 593 | Bacteroides uniformis |
| 64 | 595 | Bacteroides zoogleoformans |
| 65 | 596 | Barnesiella viscericola |
| 95 | 837 | Duncaniella dubosii |
| 170 | 1359 | Mucinivorans hirudinis |
| 171 | 1363 | Muribaculum intestinale |
| 196 | 1593 | Phocaeicola salanitronis |
| 203 | 1624 | Porphyromonas gingivalis |
| 204 | 1626 | Prevotella dentalis |
| 205 | 1627 | Prevotella denticola |
| 206 | 1628 | Prevotella enoeca |
| 207 | 1629 | Prevotella fusca |
| 208 | 1630 | Prevotella intermedia |
| 209 | 1631 | Prevotella jejuni |
| 210 | 1632 | Prevotella melaninogenica |
| 211 | 1633 | Prevotella oris |
| 212 | 1634 | Prevotella ruminicola |
| 213 | 1635 | Prevotella scopos |
| 261 | 2143 | Tannerella forsythia |
| 484 | 3717 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
109
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 266 | 2246 | Abiotrophia defectiva |
| 273 | 2331 | Actinomyces pacaensis |
| 30 | 377 | Actinomyces sp. oral taxon 169 |
| 275 | 2347 | Acutalibacter muris |
| 7 | 48 | Aminipila sp. JN-18 |
| ... | ... | ... |
| 690 | 5760 | [Clostridium] hylemonae |
| 691 | 5761 | [Clostridium] innocuum |
| 692 | 5762 | [Clostridium] scindens |
| 693 | 5763 | [Eubacterium] cellulosolvens |
| 694 | 5766 | [Ruminococcus] gnavus |
109 rows × 2 columns
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b6524b36910>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b6524bbe910>]
PCA_full.explained_variance_ratio_[0:10]
array([0.20437433, 0.10939611, 0.0640564 , 0.04235681, 0.03229447,
0.02835214, 0.02383311, 0.0211975 , 0.02011027, 0.01861927],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_1M/notebooks/PE_1M_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b05632db390>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
31
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 5 | 50 | Alloprevotella sp. E39 |
| 8 | 82 | Bacteroides sp. A1C1 |
| 9 | 83 | Bacteroides sp. CACC 737 |
| 10 | 84 | Bacteroides sp. CBA7301 |
| 11 | 85 | Bacteroides sp. HF-162 |
| 22 | 266 | Muribaculum sp. TLL-A4 |
| 25 | 312 | Prevotella sp. oral taxon 299 |
| 27 | 361 | Tannerella sp. oral taxon HOT-286 |
| 38 | 444 | Alistipes megaguti |
| 53 | 625 | Bacteroides coprosuis |
| 61 | 633 | Bacteroides uniformis |
| 63 | 635 | Bacteroides zoogleoformans |
| 64 | 638 | Barnesiella viscericola |
| 94 | 897 | Duncaniella dubosii |
| 168 | 1476 | Muribaculum intestinale |
| 182 | 1693 | Paludibacter propionicigenes |
| 191 | 1723 | Phocaeicola salanitronis |
| 197 | 1755 | Porphyromonas gingivalis |
| 198 | 1758 | Prevotella dentalis |
| 199 | 1759 | Prevotella denticola |
| 200 | 1760 | Prevotella enoeca |
| 201 | 1761 | Prevotella fusca |
| 202 | 1762 | Prevotella intermedia |
| 203 | 1763 | Prevotella jejuni |
| 204 | 1764 | Prevotella melaninogenica |
| 205 | 1765 | Prevotella oris |
| 206 | 1766 | Prevotella ruminicola |
| 207 | 1767 | Prevotella scopos |
| 208 | 1778 | Proteiniphilum saccharofermentans |
| 255 | 2315 | Tannerella forsythia |
| 474 | 4009 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
109
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 259 | 2431 | Abiotrophia defectiva |
| 266 | 2520 | Actinomyces pacaensis |
| 268 | 2536 | Acutalibacter muris |
| 7 | 53 | Aminipila sp. JN-18 |
| 42 | 461 | Anaerocolumna sp. CBA3638 |
| ... | ... | ... |
| 679 | 6146 | [Clostridium] hylemonae |
| 680 | 6147 | [Clostridium] innocuum |
| 681 | 6148 | [Clostridium] scindens |
| 682 | 6149 | [Eubacterium] cellulosolvens |
| 683 | 6152 | [Ruminococcus] gnavus |
109 rows × 2 columns
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b0564277210>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b05642ea9d0>]
PCA_full.explained_variance_ratio_[0:10]
array([0.2054042 , 0.11034953, 0.06406517, 0.04589558, 0.03303425,
0.02962623, 0.02517694, 0.02215339, 0.02087491, 0.01917927],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_5M/notebooks/PE_5M_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2aac67868490>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
30
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 4 | 64 | Alloprevotella sp. E39 |
| 8 | 99 | Bacteroides sp. A1C1 |
| 9 | 100 | Bacteroides sp. CACC 737 |
| 10 | 101 | Bacteroides sp. CBA7301 |
| 11 | 102 | Bacteroides sp. HF-162 |
| 22 | 305 | Muribaculum sp. TLL-A4 |
| 26 | 355 | Prevotella sp. oral taxon 299 |
| 27 | 413 | Tannerella sp. oral taxon HOT-286 |
| 38 | 509 | Alistipes megaguti |
| 53 | 718 | Bacteroides coprosuis |
| 61 | 726 | Bacteroides uniformis |
| 63 | 728 | Bacteroides zoogleoformans |
| 64 | 731 | Barnesiella viscericola |
| 94 | 1039 | Duncaniella dubosii |
| 164 | 1718 | Mucinivorans hirudinis |
| 165 | 1722 | Muribaculum intestinale |
| 189 | 2016 | Phocaeicola salanitronis |
| 195 | 2049 | Porphyromonas gingivalis |
| 196 | 2052 | Prevotella dentalis |
| 197 | 2053 | Prevotella denticola |
| 198 | 2054 | Prevotella enoeca |
| 199 | 2055 | Prevotella fusca |
| 200 | 2056 | Prevotella intermedia |
| 201 | 2057 | Prevotella jejuni |
| 202 | 2058 | Prevotella melaninogenica |
| 203 | 2059 | Prevotella oris |
| 204 | 2060 | Prevotella ruminicola |
| 205 | 2061 | Prevotella scopos |
| 464 | 4589 | Lachnospira eligens |
| 716 | 7592 | Glaciecola amylolytica |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
111
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 252 | 2822 | Abiotrophia defectiva |
| 259 | 2925 | Actinomyces pacaensis |
| 29 | 457 | Actinomyces sp. oral taxon 169 |
| 261 | 2941 | Acutalibacter muris |
| 6 | 67 | Aminipila sp. JN-18 |
| ... | ... | ... |
| 664 | 7012 | [Clostridium] hylemonae |
| 665 | 7013 | [Clostridium] innocuum |
| 666 | 7014 | [Clostridium] scindens |
| 667 | 7015 | [Eubacterium] cellulosolvens |
| 668 | 7018 | [Ruminococcus] gnavus |
111 rows × 2 columns
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2aac67b3c650>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2aac6888ca10>]
PCA_full.explained_variance_ratio_[0:10]
array([0.20483352, 0.1129159 , 0.06683698, 0.0446279 , 0.03363062,
0.02847929, 0.02470426, 0.02361299, 0.01978293, 0.01735257],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
This notebook in intended as a generic notebook to be used with the papermill python library to allow automated generation of analyses and reports for classifiers on microbiome data generated by kraken2 pipeline. Here we do differential expression and PCA on the full dataset.
cd /project/src
[Errno 2] No such file or directory: '/project/src' /project/6011811/data/microbiome_OJS/workflow
from sklearn import model_selection
from sklearn import metrics
import os
import re
import copy
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, '/project/6011811/data/microbiome_OJS/workflow/src/')
from MicroBiome import MicroBiomeDataSet, Trainer, TrainTester, MultiTrainTester, list_transformer, DiffExpTransform
from ScoreFunctions import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model as LM
import seaborn as sns
import pickle as pk
from matplotlib import pyplot as plt
# Ignore warning messages
if True:
import warnings
warnings.filterwarnings('ignore')
input_dir = '/project/data/preprocessed/PE_50K_sex_complete'
fdr_DE = 0.05
# Parameters
input_dir = "results/kraken2_PE_10M/notebooks/PE_10M_Sex/prepped_data"
fdr_DE = 0.05
os.listdir(input_dir)
['meta_data_mat.pk', 'metadata_samples_keep.csv', 'y.pk', 'feat_meta.csv', 'X.pk']
infile_X = open(os.path.join(input_dir, 'X.pk'),'rb')
X = pk.load(infile_X)
infile_X.close()
infile_y = open(os.path.join(input_dir, 'y.pk'),'rb')
y = pk.load(infile_y)
infile_y.close()
# infile_meta_data_mat = open(os.path.join(input_dir, 'meta_data_mat.pk'), 'rb')
# meta_data_mat = pk.load(infile_meta_data_mat)
# infile_meta_data_mat.close()
# model input
# X_inp = np.concatenate([X, meta_data_mat], axis=1)
sample_meta = pd.read_csv(os.path.join(input_dir, 'metadata_samples_keep.csv'))
feat_meta = pd.read_csv(os.path.join(input_dir, 'feat_meta.csv'))
X_inp = X
Note that tests performed are 1 sided t-tests for each class. If we really want we can do FDR correction based off of union of hypotheses tested for each class, but for now, implementation is FDR correction only for 1 set of class hypotheses at a time (i.e. separately apply FDR correction for each class)
DE = DiffExpTransform()
DE.fit(X, y)
<MicroBiome.DiffExpTransform at 0x2b31db9f7bd0>
# ACVD Negative Associated Features
DE.results['0']['neg_log10_p'] = -(np.log10(DE.results['0']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['0'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['0'][DE.results['0']['rejected'] == True].shape[0]
28
feat_meta.iloc[np.argwhere((DE.results['0']['rejected'] == True).to_numpy()).flatten(), :]
| Unnamed: 0 | feature | |
|---|---|---|
| 4 | 73 | Alloprevotella sp. E39 |
| 8 | 111 | Bacteroides sp. A1C1 |
| 9 | 112 | Bacteroides sp. CACC 737 |
| 10 | 113 | Bacteroides sp. CBA7301 |
| 11 | 114 | Bacteroides sp. HF-162 |
| 21 | 328 | Muribaculum sp. TLL-A4 |
| 24 | 378 | Prevotella sp. oral taxon 299 |
| 25 | 441 | Tannerella sp. oral taxon HOT-286 |
| 36 | 542 | Alistipes megaguti |
| 51 | 761 | Bacteroides coprosuis |
| 59 | 769 | Bacteroides uniformis |
| 61 | 771 | Bacteroides zoogleoformans |
| 62 | 775 | Barnesiella viscericola |
| 93 | 1104 | Duncaniella dubosii |
| 164 | 1834 | Muribaculum intestinale |
| 188 | 2150 | Phocaeicola salanitronis |
| 194 | 2185 | Porphyromonas gingivalis |
| 195 | 2188 | Prevotella dentalis |
| 196 | 2189 | Prevotella denticola |
| 197 | 2190 | Prevotella enoeca |
| 198 | 2191 | Prevotella fusca |
| 199 | 2192 | Prevotella intermedia |
| 200 | 2193 | Prevotella jejuni |
| 201 | 2194 | Prevotella melaninogenica |
| 202 | 2195 | Prevotella oris |
| 203 | 2196 | Prevotella ruminicola |
| 204 | 2197 | Prevotella scopos |
| 462 | 4840 | Lachnospira eligens |
# ACVD Negative Associated Features
DE.results['1']['neg_log10_p'] = -(np.log10(DE.results['1']['pval'].to_numpy()))
sns.scatterplot(data=DE.results['1'], x='tstat', y='neg_log10_p', hue='rejected')
<AxesSubplot:xlabel='tstat', ylabel='neg_log10_p'>
DE.results['1'][DE.results['1']['rejected'] == True].shape[0]
112
feat_meta.iloc[np.argwhere((DE.results['1']['rejected'] == True).to_numpy()).flatten(), :].sort_values('feature')
| Unnamed: 0 | feature | |
|---|---|---|
| 253 | 3008 | Abiotrophia defectiva |
| 260 | 3114 | Actinomyces pacaensis |
| 27 | 487 | Actinomyces sp. oral taxon 169 |
| 6 | 76 | Aminipila sp. JN-18 |
| 270 | 3236 | Anaerocolumna cellulosilytica |
| ... | ... | ... |
| 665 | 7357 | [Clostridium] hylemonae |
| 666 | 7358 | [Clostridium] innocuum |
| 667 | 7359 | [Clostridium] scindens |
| 668 | 7360 | [Eubacterium] cellulosolvens |
| 669 | 7364 | [Ruminococcus] gnavus |
112 rows × 2 columns
std_scaler_full = StandardScaler()
X_scaled_full = std_scaler_full.fit_transform(DE.transform(X))
PCA_full = PCA()
X_PCA_full = PCA_full.fit_transform(X_scaled_full)
plt.plot(PCA_full.explained_variance_ratio_[0:100])
[<matplotlib.lines.Line2D at 0x2b31dcba5f10>]
plt.plot(PCA_full.explained_variance_ratio_[0:10])
[<matplotlib.lines.Line2D at 0x2b31dcc08b90>]
PCA_full.explained_variance_ratio_[0:10]
array([0.20572992, 0.10992485, 0.06751885, 0.04557551, 0.03401557,
0.03030768, 0.02428878, 0.02370493, 0.01932176, 0.01810731],
dtype=float32)
PCA_plot_df = pd.DataFrame(X_PCA_full, columns = np.array(['PC{}'.format(i) for i in range(1, X_PCA_full.shape[1] + 1)]))
PCA_plot_df['ACVD status'] = y
sns.scatterplot(data=PCA_plot_df, x='PC1', y='PC2', hue='ACVD status')
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
sns.scatterplot(data=PCA_plot_df, x='PC2', y='PC3', hue='ACVD status')
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
sns.scatterplot(data=PCA_plot_df, x='PC3', y='PC4', hue='ACVD status')
<AxesSubplot:xlabel='PC3', ylabel='PC4'>